{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install openai\n",
    "!pip install getpass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Descrive the topic of the dataset you want to create\n",
    "\n",
    "\n",
    "# Example Prompts\n",
    "# prompt = \"A model that takes in a puzzle-like reasoning-heavy question in English, and responds with a well-reasoned, step-by-step thought out response in Spanish.\"\n",
    "# prompt = \"A Model that takes in a question on badminton rules and responds with a well-reasoned, step-by-step thought out response.\"\n",
    "# prompt = \"A Model that takes in a question on quantum physics and responds with a well-reasoned, step-by-step thought out response.\"\n",
    "\n",
    "prompt = \"A Model that takes in a question on Tao Science and responds with a well-reasoned, step-by-step thought out response.\"\n",
    "\n",
    "temperature = .4\n",
    "number_of_examples = 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import openai\n",
    "import getpass\n",
    "\n",
    "openai.api_key = getpass.getpass(\"OPEN AI API KEY: \")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import random\n",
    "\n",
    "\n",
    "def generate_example(prompt, prev_examples, temperature=.5):\n",
    "    messages=[\n",
    "        {\n",
    "            \"role\": \"system\",\n",
    "            \"content\": f\"You are generating data which will be used to train a machine learning model.\\n\\nYou will be given a high-level description of the model we want to train, and from that, you will generate data samples, each with a prompt/response pair.\\n\\nYou will do so in this format:\\n```\\nprompt\\n-----------\\n$prompt_goes_here\\n-----------\\n\\nresponse\\n-----------\\n$response_goes_here\\n-----------\\n```\\n\\nOnly one prompt/response pair should be generated per turn.\\n\\nFor each turn, make the example slightly more complex than the last, while ensuring diversity.\\n\\nMake sure your samples are unique and diverse, yet high-quality and complex enough to train a well-performing model.\\n\\nHere is the type of model we want to train:\\n`{prompt}`\"\n",
    "        }\n",
    "    ]\n",
    "\n",
    "    if len(prev_examples) > 0:\n",
    "        if len(prev_examples) > 10:\n",
    "            prev_examples = random.sample(prev_examples, 10)\n",
    "        for example in prev_examples:\n",
    "            messages.append({\n",
    "                \"role\": \"assistant\",\n",
    "                \"content\": example\n",
    "            })\n",
    "\n",
    "    response = openai.ChatCompletion.create(\n",
    "        model=\"gpt-3.5-turbo\",\n",
    "        messages=messages,\n",
    "        temperature=temperature,\n",
    "        max_tokens=2048,\n",
    "    )\n",
    "\n",
    "    return response.choices[0].message['content']\n",
    "\n",
    "# Generate examples\n",
    "prev_examples = []\n",
    "for i in range(number_of_examples):\n",
    "    print(f'Generating example {i}')\n",
    "    example = generate_example(prompt, prev_examples, temperature)\n",
    "    prev_examples.append(example)\n",
    "\n",
    "print(prev_examples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_system_message(prompt):\n",
    "\n",
    "    response = openai.ChatCompletion.create(\n",
    "        model=\"gpt-4\",\n",
    "        messages=[\n",
    "          {\n",
    "            \"role\": \"system\",\n",
    "            \"content\": \"You will be given a high-level description of the model we are training, and from that, you will generate a simple system prompt for that model to use. Remember, you are not generating the system message for data generation -- you are generating the system message to use for inference. A good format to follow is `Given $INPUT_DATA, you will $WHAT_THE_MODEL_SHOULD_DO.`.\\n\\nMake it as concise as possible. Include nothing but the system prompt in your response.\\n\\nFor example, never write: `\\\"$SYSTEM_PROMPT_HERE\\\"`.\\n\\nIt should be like: `$SYSTEM_PROMPT_HERE`.\"\n",
    "          },\n",
    "          {\n",
    "              \"role\": \"user\",\n",
    "              \"content\": prompt.strip(),\n",
    "          }\n",
    "        ],\n",
    "        temperature=temperature,\n",
    "        max_tokens=500,\n",
    "    )\n",
    "\n",
    "    return response.choices[0].message['content']\n",
    "\n",
    "system_message = generate_system_message(prompt)\n",
    "\n",
    "print(f'The system message is: `{system_message}`. Feel free to re-run this cell if you want a better result.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Initialize lists to store prompts and responses\n",
    "prompts = []\n",
    "responses = []\n",
    "\n",
    "# Parse out prompts and responses from examples\n",
    "for example in prev_examples:\n",
    "  try:\n",
    "    split_example = example.split('-----------')\n",
    "    prompts.append(split_example[1].strip())\n",
    "    responses.append(split_example[3].strip())\n",
    "  except:\n",
    "    pass\n",
    "\n",
    "# Create a DataFrame\n",
    "df = pd.DataFrame({\n",
    "    'prompt': prompts,\n",
    "    'response': responses\n",
    "})\n",
    "\n",
    "# Remove duplicates\n",
    "df = df.drop_duplicates()\n",
    "\n",
    "print('There are ' + str(len(df)) + ' successfully-generated examples. Here are the first few:')\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the data into train and test sets, with 90% in the train set\n",
    "train_df = df.sample(frac=0.9, random_state=42)\n",
    "test_df = df.drop(train_df.index)\n",
    "\n",
    "# Save the dataframes to .jsonl files\n",
    "train_df.to_json('train.jsonl', orient='records', lines=True)\n",
    "test_df.to_json('test.jsonl', orient='records', lines=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "323aeb29f29747a19aea45fffc93bc8e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating gen split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "ename": "DatasetGenerationError",
     "evalue": "An error occurred while generating the dataset",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mUnicodeDecodeError\u001b[0m                        Traceback (most recent call last)",
      "File \u001b[1;32mc:\\Projects\\LLM-Cookbook\\LLM-venv\\Lib\\site-packages\\datasets\\builder.py:1726\u001b[0m, in \u001b[0;36mGeneratorBasedBuilder._prepare_split_single\u001b[1;34m(self, gen_kwargs, fpath, file_format, max_shard_size, split_info, check_duplicate_keys, job_id)\u001b[0m\n\u001b[0;32m   1725\u001b[0m _time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[1;32m-> 1726\u001b[0m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrecord\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mgenerator\u001b[49m\u001b[43m:\u001b[49m\n\u001b[0;32m   1727\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mmax_shard_size\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mand\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mwriter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_num_bytes\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m>\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mmax_shard_size\u001b[49m\u001b[43m:\u001b[49m\n",
      "File \u001b[1;32m~\\.cache\\huggingface\\modules\\datasets_modules\\datasets\\ai4bharat--IN22-Gen\\7fd712b9463756101b749cf33dd7de98cb1e1b78ee66403f65ca3894ad5e4147\\IN22-Gen.py:179\u001b[0m, in \u001b[0;36mIN22Gen._generate_examples\u001b[1;34m(self, sentence_paths, metadata_path, langs)\u001b[0m\n\u001b[0;32m    178\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(path, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m sent_file:\n\u001b[1;32m--> 179\u001b[0m         sentences[lang] \u001b[38;5;241m=\u001b[39m [l\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m l \u001b[38;5;129;01min\u001b[39;00m \u001b[43msent_file\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreadlines\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m]\n\u001b[0;32m    180\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(metadata_path, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m metadata_file:\n",
      "File \u001b[1;32mC:\\Python311\\Lib\\encodings\\cp1252.py:23\u001b[0m, in \u001b[0;36mIncrementalDecoder.decode\u001b[1;34m(self, input, final)\u001b[0m\n\u001b[0;32m     22\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m, final\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m):\n\u001b[1;32m---> 23\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m codecs\u001b[38;5;241m.\u001b[39mcharmap_decode(\u001b[38;5;28minput\u001b[39m,\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39merrors,decoding_table)[\u001b[38;5;241m0\u001b[39m]\n",
      "\u001b[1;31mUnicodeDecodeError\u001b[0m: 'charmap' codec can't decode byte 0x8f in position 2: character maps to <undefined>",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001b[1;31mDatasetGenerationError\u001b[0m                    Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[2], line 4\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_dataset\n\u001b[0;32m      3\u001b[0m \u001b[38;5;66;03m# download and load all the pairs\u001b[39;00m\n\u001b[1;32m----> 4\u001b[0m dataset \u001b[38;5;241m=\u001b[39m \u001b[43mload_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mai4bharat/IN22-Gen\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mall\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m      6\u001b[0m \u001b[38;5;66;03m# download and load specific pairs\u001b[39;00m\n\u001b[0;32m      7\u001b[0m \u001b[38;5;66;03m# dataset = load_dataset(\"ai4bharat/IN22-Gen\", \"eng_Latn-hin_Deva\")\u001b[39;00m\n",
      "File \u001b[1;32mc:\\Projects\\LLM-Cookbook\\LLM-venv\\Lib\\site-packages\\datasets\\load.py:2549\u001b[0m, in \u001b[0;36mload_dataset\u001b[1;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)\u001b[0m\n\u001b[0;32m   2546\u001b[0m try_from_hf_gcs \u001b[38;5;241m=\u001b[39m path \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m _PACKAGED_DATASETS_MODULES\n\u001b[0;32m   2548\u001b[0m \u001b[38;5;66;03m# Download and prepare data\u001b[39;00m\n\u001b[1;32m-> 2549\u001b[0m \u001b[43mbuilder_instance\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdownload_and_prepare\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   2550\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   2551\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   2552\u001b[0m \u001b[43m    \u001b[49m\u001b[43mverification_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverification_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   2553\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtry_from_hf_gcs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtry_from_hf_gcs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   2554\u001b[0m \u001b[43m    \u001b[49m\u001b[43mnum_proc\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnum_proc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   2555\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   2556\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   2558\u001b[0m \u001b[38;5;66;03m# Build dataset for splits\u001b[39;00m\n\u001b[0;32m   2559\u001b[0m keep_in_memory \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m   2560\u001b[0m     keep_in_memory \u001b[38;5;28;01mif\u001b[39;00m keep_in_memory \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m is_small_dataset(builder_instance\u001b[38;5;241m.\u001b[39minfo\u001b[38;5;241m.\u001b[39mdataset_size)\n\u001b[0;32m   2561\u001b[0m )\n",
      "File \u001b[1;32mc:\\Projects\\LLM-Cookbook\\LLM-venv\\Lib\\site-packages\\datasets\\builder.py:1005\u001b[0m, in \u001b[0;36mDatasetBuilder.download_and_prepare\u001b[1;34m(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)\u001b[0m\n\u001b[0;32m   1003\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m num_proc \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m   1004\u001b[0m         prepare_split_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_proc\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m num_proc\n\u001b[1;32m-> 1005\u001b[0m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_download_and_prepare\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   1006\u001b[0m \u001b[43m        \u001b[49m\u001b[43mdl_manager\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdl_manager\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1007\u001b[0m \u001b[43m        \u001b[49m\u001b[43mverification_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverification_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1008\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mprepare_split_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1009\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mdownload_and_prepare_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1010\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   1011\u001b[0m \u001b[38;5;66;03m# Sync info\u001b[39;00m\n\u001b[0;32m   1012\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minfo\u001b[38;5;241m.\u001b[39mdataset_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28msum\u001b[39m(split\u001b[38;5;241m.\u001b[39mnum_bytes \u001b[38;5;28;01mfor\u001b[39;00m split \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minfo\u001b[38;5;241m.\u001b[39msplits\u001b[38;5;241m.\u001b[39mvalues())\n",
      "File \u001b[1;32mc:\\Projects\\LLM-Cookbook\\LLM-venv\\Lib\\site-packages\\datasets\\builder.py:1767\u001b[0m, in \u001b[0;36mGeneratorBasedBuilder._download_and_prepare\u001b[1;34m(self, dl_manager, verification_mode, **prepare_splits_kwargs)\u001b[0m\n\u001b[0;32m   1766\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_download_and_prepare\u001b[39m(\u001b[38;5;28mself\u001b[39m, dl_manager, verification_mode, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mprepare_splits_kwargs):\n\u001b[1;32m-> 1767\u001b[0m     \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_download_and_prepare\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   1768\u001b[0m \u001b[43m        \u001b[49m\u001b[43mdl_manager\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1769\u001b[0m \u001b[43m        \u001b[49m\u001b[43mverification_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1770\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcheck_duplicate_keys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverification_mode\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mVerificationMode\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mBASIC_CHECKS\u001b[49m\n\u001b[0;32m   1771\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mverification_mode\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mVerificationMode\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mALL_CHECKS\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1772\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mprepare_splits_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1773\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mc:\\Projects\\LLM-Cookbook\\LLM-venv\\Lib\\site-packages\\datasets\\builder.py:1100\u001b[0m, in \u001b[0;36mDatasetBuilder._download_and_prepare\u001b[1;34m(self, dl_manager, verification_mode, **prepare_split_kwargs)\u001b[0m\n\u001b[0;32m   1096\u001b[0m split_dict\u001b[38;5;241m.\u001b[39madd(split_generator\u001b[38;5;241m.\u001b[39msplit_info)\n\u001b[0;32m   1098\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m   1099\u001b[0m     \u001b[38;5;66;03m# Prepare split will record examples associated to the split\u001b[39;00m\n\u001b[1;32m-> 1100\u001b[0m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_prepare_split\u001b[49m\u001b[43m(\u001b[49m\u001b[43msplit_generator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mprepare_split_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   1101\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m   1102\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\n\u001b[0;32m   1103\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCannot find data file. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   1104\u001b[0m         \u001b[38;5;241m+\u001b[39m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmanual_download_instructions \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m   1105\u001b[0m         \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mOriginal error:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   1106\u001b[0m         \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(e)\n\u001b[0;32m   1107\u001b[0m     ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n",
      "File \u001b[1;32mc:\\Projects\\LLM-Cookbook\\LLM-venv\\Lib\\site-packages\\datasets\\builder.py:1605\u001b[0m, in \u001b[0;36mGeneratorBasedBuilder._prepare_split\u001b[1;34m(self, split_generator, check_duplicate_keys, file_format, num_proc, max_shard_size)\u001b[0m\n\u001b[0;32m   1603\u001b[0m job_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m   1604\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m pbar:\n\u001b[1;32m-> 1605\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mjob_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdone\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcontent\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_prepare_split_single\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   1606\u001b[0m \u001b[43m        \u001b[49m\u001b[43mgen_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgen_kwargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mjob_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjob_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43m_prepare_split_args\u001b[49m\n\u001b[0;32m   1607\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[0;32m   1608\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mdone\u001b[49m\u001b[43m:\u001b[49m\n\u001b[0;32m   1609\u001b[0m \u001b[43m            \u001b[49m\u001b[43mresult\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mcontent\u001b[49m\n",
      "File \u001b[1;32mc:\\Projects\\LLM-Cookbook\\LLM-venv\\Lib\\site-packages\\datasets\\builder.py:1762\u001b[0m, in \u001b[0;36mGeneratorBasedBuilder._prepare_split_single\u001b[1;34m(self, gen_kwargs, fpath, file_format, max_shard_size, split_info, check_duplicate_keys, job_id)\u001b[0m\n\u001b[0;32m   1760\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e, SchemaInferenceError) \u001b[38;5;129;01mand\u001b[39;00m e\u001b[38;5;241m.\u001b[39m__context__ \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m   1761\u001b[0m         e \u001b[38;5;241m=\u001b[39m e\u001b[38;5;241m.\u001b[39m__context__\n\u001b[1;32m-> 1762\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m DatasetGenerationError(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAn error occurred while generating the dataset\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[0;32m   1764\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m job_id, \u001b[38;5;28;01mTrue\u001b[39;00m, (total_num_examples, total_num_bytes, writer\u001b[38;5;241m.\u001b[39m_features, num_shards, shard_lengths)\n",
      "\u001b[1;31mDatasetGenerationError\u001b[0m: An error occurred while generating the dataset"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# download and load all the pairs\n",
    "dataset = load_dataset(\"facebook/flores\", \"all\")\n",
    "# dataset = load_dataset(\"ai4bharat/IN22-Gen\", \"all\")\n",
    "\n",
    "# download and load specific pairs\n",
    "# dataset = load_dataset(\"ai4bharat/IN22-Gen\", \"eng_Latn-hin_Deva\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error: Invalid \\uXXXX escape: line 9747 column 194 (char 6029311)\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "\n",
    "# Read the JSON file as a string\n",
    "with open(\"input.json\", \"r\", encoding=\"utf-8\") as f:\n",
    "    json_string = f.read()\n",
    "\n",
    "# Define the problematic Unicode escape sequence\n",
    "problematic_sequence = \"\\\\uXXXX\"  # Replace XXXX with the actual problematic sequence\n",
    "\n",
    "# Replace the problematic sequence with an empty string\n",
    "cleaned_json_string = json_string.replace(problematic_sequence, \"\")\n",
    "\n",
    "# Decode the cleaned JSON string\n",
    "try:\n",
    "    data = json.loads(cleaned_json_string)\n",
    "except json.JSONDecodeError as e:\n",
    "    print(\"Error:\", e)\n",
    "    # Handle the error (e.g., log, skip, or raise an exception)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Specify the input and output file paths\n",
    "input_file = \"input.json\"\n",
    "output_file = \"output.json\"\n",
    "\n",
    "# Read the JSON file\n",
    "with open(input_file, \"r\", encoding=\"utf-8\") as f:\n",
    "    data = json.load(f)\n",
    "\n",
    "# Write the data back to the output file with UTF-8 encoding\n",
    "with open(output_file, \"w\", encoding=\"utf-8\") as f:\n",
    "    json.dump(data, f, ensure_ascii=False, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset.to_json(\"\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
